#!/usr/bin/env python
#coding=utf-8
from module import TAGModule
import platform

class testStaticClass:
    """Load-once holder for the tag dictionary (``TAGModule.TagWordSeqNoDic``).

    The class is never instantiated; both methods are classmethods and the
    load state lives in the class attribute ``i``. The loaded dictionary is
    published as the module-level global ``tagWordSeqNoDic``.
    """

    # Load flag: 0 until the DAT file has been read successfully, 1 afterwards.
    i = 0

    @classmethod
    def judge(cls):
        """Load the tag dictionary unless an earlier call already loaded it."""
        if cls.i != 1:
            cls.test()

    @classmethod
    def test(cls):
        """Read the tag DAT file and publish it as global ``tagWordSeqNoDic``.

        The flag ``i`` and the global are only updated after ``readDAT``
        returns, so a failed load propagates its exception and leaves the
        class in the "not loaded" state; the next ``judge()`` call retries.
        """
        global tagWordSeqNoDic
        if platform.system() == 'Windows':
            TagDATPath = "F:\\Laptop-Acer\\Document\\Other\\Python\\MITagger\\MITagger-searchEngine\\resource\\TAG\\tags_fixed.dat"
        else:
            # Every non-Windows system gets the forward-slash spelling, so the
            # path is always bound (previously only 'Linux' was handled and
            # other systems crashed with UnboundLocalError).
            # NOTE(review): this still carries an "F:" drive prefix, which
            # looks Windows-specific -- confirm the intended POSIX location.
            TagDATPath = "F:/Laptop-Acer/Document/Other/Python/MITagger/MITagger-searchEngine/resource/TAG/tags_fixed.dat"
        # TagWordSeqNoDic contains tagSeqNoList and tagNameContent
        loaded = TAGModule.TagWordSeqNoDic()
        # A single parenthesised string prints identically under the Python 2
        # print statement and the Python 3 print function.
        print("reading DAT file...")
        loaded.readDAT(TagDATPath)
        print("DAT file imports finished!")
        # Publish only the fully populated dictionary, then mark it loaded.
        tagWordSeqNoDic = loaded
        cls.i = 1


"""
import re
import os

if __name__ == '__main__':
    i = 0
    testList = [2, 4, 9, 7, 1, 1,6, 5,10]
    while i < len(testList):
        print testList[i]
        
        if testList[i] > 5:
            print "No = " + str(i)
            print "delElement =" + str(testList[i])
            del testList[i]
            i -= 1
        i += 1
        
    print testList
"""
"""
#read Filename in OS
import os  
filenames=os.listdir(os.getcwd())  
for name in filenames:  
    print name
del filenames
"""
"""
global tagIndexList
BASE_DIR = os.path.dirname(os.getcwd())
DATPath = BASE_DIR + "\\resource\\TAG\\tags.dat"
tagIndexList = TAGModule.TagIndexList()
print "reading tags file..."
tagIndexList.readDAT(DATPath)
print "TAG import finished!"
pause = raw_input("pause")
"""
"""
import linecache
filename = "E:\\MITagger\\MITagger-searchEngine\\code\\config\\config_del.ini"
count = linecache.getline(filename,2)
print count
"""



"""
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()


from nltk import SnowballStemmer
stemmer = SnowballStemmer("english")


from nltk.stem import RSLPStemmer
st1 = RSLPStemmer()

from nltk.stem.isri import ISRIStemmer
stI = ISRIStemmer()

from nltk.stem.porter import PorterStemmer
PS = PorterStemmer()


from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()



word = raw_input("word\n")
#print "lancaster: " + st.stem(word)#将后缀全部去掉，不科学
print "snowball: " + stemmer.stem(word)#可以识别s结尾是否为复数
#print "RSLP: " + st1.stem(word)#正确率低
#print "ISRI: " + stI.stem(word)#貌似不是英文
#print "porter: " + PS.stem(word)#一律去掉s结尾
#print "wordnet: " + wnl.lemmatize(word)#时间太长
"""

"""
string = "a-a"
#string = re.sub(r'\W+\s',' ',string) #将后面是空格的标点符号去掉，并且将多个连在一起的标点去掉
#string = re.sub(r'(?<=\s)\W(?=\w)','',string) #将前面是空格后面是字母的标点符号去掉
#string = string[:-1] #去掉最后的句号
#string = re.sub(r'\s\W!\W',' ',string)
#string = re.sub(r'\(','',string)
#string = re.sub(r'\)','',string)
if re.search(r'\W', string):
	print "hehe"
#print string
"""

"""
#分词以及词性判断
###librarys:  
import sys  
  
  
###global variables:  
freqdic={}  
dic={}  
transferdic={}  
inputfilename=''  
outputfilename=''  
trainingfilename=''  
marklist=[]  
  
###classes:  
class nodeinfotable:  
    def __init__(self,No,length,previousdict):  
        self.No=No  
        self.length=length  
        self.previousdict=previousdict  
          
class node:  
    def __init__(self,outpointerdict,tablelist):  
        self.outpointerdict=outpointerdict  
        self.tablelist=tablelist  
    def addOutPointer(self,targetnum,wordname):  
        self.outpointerdict[targetnum]=wordname  
  
  
class wordnode:  
    def __init__(self,wordval,propertynodelist):  
        self.wordval=wordval  
        self.propertynodelist=propertynodelist  
  
class propertynode:  
    def __init__(self,propertyval,probability,beststack):  
        self.propertyval=propertyval  
        self.probability=probability  
        self.beststack=beststack  
  
  
###functions:  
#find cmpstr in valuestr,which is splited by ','  
def findinstr(valuestr,cmpstr):  
    wordlist=valuestr.split(',')  
    for w in wordlist:  
        if cmpstr==w:  
            return True  
    return False  
      
# generate the dictionary  
def chnsegtager_training():  
    global dic  
    global freqdic  
    global transferdic  
    transfercounter=0  
    wordcounter=0  
    orgname = ''  
    f=file(trainingfilename)  
    while True:  
        line = f.readline()  
        if len(line)==0:  
            break  
        line=line.strip()  
        linewordindex=0  
        prewordproperty=''  
        currwordproperty=''  
        if line.find('/') != -1:  
            lineset = line.split('  ')  
            for w in lineset:#对于一行中每一个字母  
                wordcounter+=1  
                singlewordset = w.split('/')  
                formerword = singlewordset[0]  
                laterword = singlewordset[1]  
                if formerword.find('[') != -1:  #has '[' token  
                    formerword=formerword[1:]  
                    orgname=formerword   
                else:   
                    if orgname != '':   
                        orgname += formerword   
                  
                rbracketpos = laterword.find(']')  
                if rbracketpos != -1:            #has ']' token  
                    orgproperty=laterword[rbracketpos+1:]  
                    laterword=laterword[:rbracketpos]  
                    if dic.has_key(orgname):  
                        if freqdic.has_key(orgname+','+orgproperty):  
                            freqdic[orgname+','+orgproperty]+=1  
                        else:  
                            freqdic[orgname+','+orgproperty]=1  
                        orgvalueStr = dic[orgname]  
                        if findinstr(orgvalueStr,orgproperty)==False:   
                           dic[orgname] += ','+orgproperty   
                           orgname = ''  
                    else:  
                        dic[orgname] = orgproperty  
                        freqdic[orgname+','+orgproperty]=1  
                        orgname = ''  
                      
                if dic.has_key(formerword):#字典里存在此单词  
                    if freqdic.has_key(formerword+','+laterword):  
                        freqdic[formerword+','+laterword]+=1  
                    else:  
                        freqdic[formerword+','+laterword]=1  
                    valueStr = dic[formerword]  
                    if findinstr(valueStr,laterword)==False:  
                        dic[formerword] += ','+laterword  
                else:#字典不存在，创建  
                    dic[formerword] = laterword  
                    freqdic[formerword+','+laterword]=1  
  
                linewordindex+=1  
                if linewordindex==1:  
                    currwordproperty=laterword  
                else:  
                    prewordproperty=currwordproperty  
                    currwordproperty=laterword  
                    transstr=prewordproperty+':'+currwordproperty  
                    if transferdic.has_key(transstr):  
                        transferdic[transstr]+=1  
                        transfercounter+=1  
                    else:  
                        transferdic[transstr]=1  
                        transfercounter+=1  
                      
                linewordindex+=1  
    #统计转移概率  
    for trstr,trnum in transferdic.items():  
        pcl=float(trnum)  
        probability=pcl/transfercounter  
        transferdic[trstr]=probability  
  
    #统计词频  
    for fx,lx in freqdic.items():  
        nlx=float(lx)  
        prox=nlx/wordcounter  
        freqdic[fx]=prox  
                       
    f.close()  
  
#get the file name from command line  
def getArguments():  
    global inputfilename,outputfilename,trainingfilename  
    #sys.argv += ['input.txt','output.txt']  
    if len(sys.argv) < 3:  
        print 'Error:No file specified.(arg1:name of input file; arg2:name of output file; arg3:training file name'  
        sys.exit()  
    inputfilename = sys.argv[1]  
    outputfilename = sys.argv[2]  
    trainingfilename=sys.argv[3]  
  
#get the punctuation marks in the dict  
def getMarksList():  
    global dic,freqdic,marklist  
    for f,l in dic.items():  
        if l=='w':  
            marklist.append(f)  
  
  
#check if the character is a mark  
def checkMark(unicha):  
    for w in marklist:  
        uniw=w.decode('gbk')  
        if uniw==unicha:  
            return True  
    return False  
  
#process single sentance  
def processSentance(unisent,unisymbol):  
    global inputfilename,outputfilename  
    if len(unisent)==0:  
        gbksy = unisymbol.encode('gbk')  
        if gbksy!='' and gbksy!='/n':  
            savefile=file(outputfilename,'a')  
            savefile.write(gbksy+'//w')  
            savefile.write('  ')  
            savefile.close()  
        return  
    global dic,freqdic  
    nodelist=[] #store the word in the the single phrase  
    index=0  
    sentlen=len(unisent)  
    for w in unisent:  
        index+=1 #index is the character behind w  
        #---------add the single word edge----------  
        gbksingleword=w.encode('gbk')  
        if dic.has_key(gbksingleword):  
            newnode=node({gbksingleword:dic[gbksingleword]},[])  
        else:  
            newnode=node({gbksingleword:'unknown'},[])  
          
        if index==sentlen:  #reach the last element  
            nodelist.append(newnode)  
            break  
        #newnode= node({},[])#make a node  
        unistrafterw=unisent[index:]  
        fw=w  
        lw=u''  
        for bw in unistrafterw:  
            lw+=bw  
            uniword=fw+lw  
            gbkword=uniword.encode('gbk')  
            if dic.has_key(gbkword):#create a chain  
                newnode.outpointerdict[gbkword]=dic[gbkword]  
        nodelist.append(newnode)  
  
    lastnode=node({},[])  
    nodelist.append(lastnode)  
  
    #generate the information table for each node  
    nodeindex=0 #the index of the current node  
    for nodeitem in nodelist:   #nodeitem:node  
        if nodeindex==0:  
            nodeindex+=1  
            continue  
        if nodeindex==1:  
            newtableitem=nodeinfotable(1,1,{0:0})  
            nodeitem.tablelist.append(newtableitem)  
            nodeindex+=1  
            continue  
        No=0 #the sequence number of the table item for each node  
        prelist = nodelist[:nodeindex] #nodes before the nodeitem  
        preindex=0  
        for prenode in prelist:#node in before nodeitem  
            for wordname,wordproperty in prenode.outpointerdict.items():  
                uniwordname=wordname.decode('gbk')  
                wordlength=len(uniwordname)  
                if nodeindex==(preindex+wordlength):#exist a pointer to current node  
                    if preindex==0:  
                        No+=1  
                        newtableitem=nodeinfotable(No,1,{0:0})  
                        nodeitem.tablelist.append(newtableitem)  
                    else:  
                        preNo=0 #sequence of tableitem number in the prenode  
                        for pretableitem in prenode.tablelist:#pretableitem:nodeinfotable in the prenode  
                            preNo+=1  
                            No+=1  
                            newtableitem=nodeinfotable(No,pretableitem.length+1,{preindex:preNo})#add the item  
                            nodeitem.tablelist.append(newtableitem)  
            preindex+=1  
        nodeindex+=1  
  
    n=0  
  
    #find all the partition results in nodelist[], and store them into a hashtable  
    pathhashtable={}  
    tailnode=nodelist[-1]  
    tailnodenum=len(nodelist)-1  
    for tableitem in tailnode.tablelist:    #tableitem:single table item of the last node  
        pathlength=tableitem.length #the length of the path  
        for ff,fl in tableitem.previousdict.items():  
            prenodenumber=ff  
        pathstack=[tailnodenum]  
        iteritem=tableitem  
        while prenodenumber!=0: #go until reach the node 0  
            for f,l in iteritem.previousdict.items():  #exactly, there is noly one item in the dict  
                prenodenumber=f #previous node number  
                prenodeitemnumber=l #the number of sequence item of the previous node  
            pathstack.insert(0,prenodenumber)  
            iteritem=nodelist[prenodenumber].tablelist[prenodeitemnumber-1]  
            for f,l in iteritem.previousdict.items():  #update data  
                prenodenumber=f #previous node number  
                prenodeitemnumber=l #the number of sequence item of the previous node  
        pathstack.insert(0,0)  
        pathhashtable[pathlength]=pathstack  
  
    xulie=0  
    for length,path in pathhashtable.items():  
        if xulie==0:  
            minlen=length  
            shortestpath=path  
        else:  
            if length<minlen:  
                minlen=length  
                shortestpath=path  
        xulie+=1  
      
    #generate the result of rough split of the single sentence  
    uniresultlist=[]  
    for n in shortestpath:  
        if n==0:  
            findex=n  
            continue  
        lindex=n  
        oneword=unisent[findex:lindex]  
        findex=lindex  
        uniresultlist.append(oneword)  
  
  
    #check if it is numbers or combination of numbers, or dates,times,plus,  
    unidatetime=[u'年',u'月',u'日',u'时',u'分',u'秒']  
    uniquantifier=[u'亿',u'万',u'千',u'百',u'十',u'个']  
    unioperator=[',','.','+','-','*','%','@',u'％']  
    unidicquantifier=[]  
    for f,l in dic.items():  
        if l.find('q')!=-1:  
            uniqua=f.decode('gbk')  
            if len(uniqua)==1:  
                unidicquantifier.append(uniqua)  
    unisymbollist=unidatetime+uniquantifier+unioperator+unidicquantifier  
  
    qword=''  
    resultindex=0  
    uninewlist=[]  
    for uniword in uniresultlist:  
        if len(uniword)==1:  
            gbkupperword=uniword.encode('gbk')  
            if (isDigit(uniword))or(uniword in unisymbollist)or(gbkupperword.isalpha()):  
                qword+=uniword  
            else:  
                uninewlist.append(qword)  
                uninewlist.append(uniword)  
                gbkw=qword.encode('gbk')  
                if len(gbkw)!=0:  
                    dic[gbkw]='m'  
                    if freqdic.has_key(gbkw+',m'):  
                        freqdic[gbkw+',m']+=1  
                    else:  
                        freqdic[gbkw+',m']=1  
                qword=''  
        else:  
            if qword!='':  
                gbkw=qword.encode('gbk')  
                if len(gbkw)!=0:  
                    dic[gbkw]='m'  
                    if freqdic.has_key(gbkw+',m'):  
                        freqdic[gbkw+',m']+=1  
                    else:  
                        freqdic[gbkw+',m']=1  
                uninewlist.append(qword)  
                qword=''  
            uninewlist.append(uniword)  
        resultindex+=1  
    if len(qword)!=0:  
        uninewlist.append(qword)  
  
    unilastlist=[]  
    for w in uninewlist:  
        if len(w)!=0:  
            unilastlist.append(w)  
  
    #使用viterbi算法进行词性标注  
    vtbwordlist=[]  
    index=-1  
    hmmwordlist=[]  
    for uniword in unilastlist:  
        index+=1  
        word=uniword.encode('gbk')  
        if index==0:  
            #创建首节点  
            if dic.has_key(word):  
                firstpstr=dic[word]  
                firstproplist=firstpstr.split(',')  
                wnode=wordnode(word,[])  
                for p in firstproplist:  
                    #计算符号到当前词的转移概率  
                    if transferdic.has_key('w,'+p):  
                        pronode=propertynode(p,transferdic['w,'+p],[p])  
                    else:  
                        pronode=propertynode(p,1e-10,[p])  
                    wnode.propertynodelist.append(pronode)  
                hmmwordlist.append(wnode)  
            else:  
                #首节点未在字典定义  
                pronode=propertynode('m',1e-10,['m'])  
                wnode=wordnode(word,[pronode])  
                hmmwordlist.append(wnode)  
        else: #Index不为0  
            nowwordnode=wordnode(word,[])  
            #先取得当前结点的属性  
            if dic.has_key(word):  
                prostr=dic[word]  
                prostrlist=prostr.split(',')#当前节点的词性集  
                for p in prostrlist:  
                    #对于每一种属性，由前节点进行最优路径计算  
                    #取得前结点属性  
                    prewordnode=hmmwordlist[index-1]  
                    prepronodelist=prewordnode.propertynodelist  
                    maxprobability=0  
                    maxprenode=propertynode('',1,[])  
                    for pnode in prepronodelist:  
                        #对于前节点的每个词性，进行viterbi计算  
                        prepbt=pnode.probability  
                        preprop=pnode.propertyval  
                        transtr=preprop+':'+p  
                        if transferdic.has_key(transtr):  
                            transprobability=transferdic[transtr]  
                        else:  
                            transprobability=1e-10#不存在此转移，赋一个极小的值  
                        nowprobability=prepbt*transprobability  
                        if nowprobability>maxprobability:  
                            maxprobability=nowprobability  
                            #maxprenode=pnode  
                            maxprenode.propertyval=pnode.propertyval  
                            maxprenode.probability=pnode.probability  
                            maxprenode.beststack=pnode.beststack  
                    nowbeststack=[]  
                    for se in maxprenode.beststack:  
                        nowbeststack.append(se)  
                    nowbeststack.append(p)  
                    pronode=propertynode(p,maxprobability,nowbeststack)  
                    nowwordnode.propertynodelist.append(pronode)  
            else:  
                #字典不存在此词，则将转移概率置为1  
                #取得前结点的所有词性集，然后取最大概率词性，生成当前结点的最优路径  
                prewordnode=hmmwordlist[index-1]  
                prepronodelist=prewordnode.propertynodelist  
                maxprobability=0  
                for pnode in prepronodelist:  
                    prepbt=pnode.probability  
                    if prepbt>maxprobability:  
                        maxprobability=prepbt  
                        maxprenode=pnode  
  
                nowbeststack=[]  
                for se in maxprenode.beststack:  
                    nowbeststack.append(se)  
                nowbeststack.append('nz')  
                pronode=propertynode('nz',maxprobability,nowbeststack)  
                nowwordnode.propertynodelist.append(pronode)  
            hmmwordlist.append(nowwordnode)  
  
    hmmlastword=hmmwordlist[index]  
    maxp=0  
    for l in hmmlastword.propertynodelist:  
        if maxp<l.probability:  
            maxp=l.probability  
            maxn=l  
  
    stackindex=0  
    savefile=file(outputfilename,'a')  
    for w in unilastlist:  
        uniw=w.encode('gbk')  
        if uniw!='' and uniw!='/n':  
            savefile.write(uniw)  
            savefile.write('//'+l.beststack[stackindex])  
            stackindex+=1  
            savefile.write('  ')  
          
          
    gbksymb=unisymbol.encode('gbk')#写标点符号  
    if gbksymb!='' and gbksymb!='/n':  
        savefile.write(gbksymb)  
        savefile.write('//w')  
        savefile.write('  ')  
    savefile.close()  
      
  
              
#check if unistr is a digit  
def isDigit(unistr):  
    if unistr.isdigit():  
        return True  
    unichadigit=u'一二三四五六七八九十'  
    if unistr in unichadigit:  
        return True  
    return False  
      
      
#partition  
def chnsegtager_segtag():  
    global inputfilename,outputfilename  
    global dic,freqdic  
    infile=file(inputfilename)  
    getMarksList()  
    while True:  
        line = infile.readline()  
        if len(line)==0:  
            break  
        uniline=line.decode('gbk')  
        bufstr=u''  
  
        for w in uniline:  
            if checkMark(w)==False:  
                bufstr += w  
            else:  
                processSentance(bufstr,w) #process bufstr as a block  
                bufstr=u''  
        if len(w)!=0:  
            processSentance(bufstr,'/n')  
            bufstr=u''  
              
        savefile=file(outputfilename,'a')  
        savefile.write('/n')  
        savefile.close()  
          
    infile.close()  
  
def clearFile():  
    f = file(outputfilename,'w')  
    f.write('')  
    f.close()  
  
###execution segments:  
getArguments()  
clearFile()  
chnsegtager_training()  
chnsegtager_segtag()
"""